Below, we load the packages required for data analysis and visualization.
library(tidyverse)
library(magrittr)
library(DBI)
library(dbplyr)
library(RMariaDB)
library(data.table)
library(stopwords)
library(tidytext)
library(RColorBrewer)
library(DT)
library(wordcloud)
W. Edwards Deming is often credited with saying, “In God we trust, all others must bring data.” Below, we will use data to explore the question, “Which are the most valued data science skills?”
# Connect to the MariaDB database; the password is kept out of the script in a
# local sql_db.txt file.
con <- DBI::dbConnect(
    RMariaDB::MariaDB(),
    dbname = "dat_sci_jobs",
    username = "root",
    password = as.character(read.table("sql_db.txt", header = FALSE)),
    host = "35.227.102.234")
tables <- dbListTables(con)
jobs_df <- dbReadTable(con, "_Jobs")
datatable(jobs_df[, -3], options = list(pageLength = 25))
dbDisconnect(con)
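Much of the code that follows indexes jobs_df by column position, so it helps to record the column order the _Jobs table is assumed to have. This ordering is inferred from the renaming and reordering code below, not from a documented schema:
# Assumed _Jobs column order, referenced positionally throughout:
# 1 Job_id, 2 Job_title, 3 Job_url, 4 Job_company, 5 Job_location,
# 6 Site_id, 7 Job_complete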
completed_files <- readLines("completed_files.txt")
url_base <- "https://raw.githubusercontent.com/geedoubledee/data607_project3/main/"
new_jobs_df <- as.data.frame(matrix(nrow = 0, ncol = 11))
# Read any feed export we have not already processed from GitHub and append
# it; local filenames are matched to their copies in the repo.
files <- list.files(pattern = "_feeds_.*csv$")
for (i in seq_along(files)){
    if (!(files[i] %in% completed_files)){
        file <- paste(url_base, files[i], sep = "")
        csv <- read.csv(file = file, header = TRUE)
        new_jobs_df <- rbind(new_jobs_df, csv)
        completed_files <- append(completed_files, files[i])
    }
}
new_jobs_df2 <- as.data.frame(matrix(nrow = 0, ncol = 10))
# Do the same for any LinkedIn export we have not already processed.
files <- list.files(pattern = "_linkedin_.*csv$")
for (i in seq_along(files)){
    if (!(files[i] %in% completed_files)){
        file <- paste(url_base, files[i], sep = "")
        csv <- read.csv(file = file, header = TRUE)
        new_jobs_df2 <- rbind(new_jobs_df2, csv)
        completed_files <- append(completed_files, files[i])
    }
}
writeLines(completed_files, "completed_files.txt")
# Drop exact duplicate rows picked up across export files.
new_jobs_df <- new_jobs_df[!duplicated(new_jobs_df), ]
new_jobs_df2 <- new_jobs_df2[!duplicated(new_jobs_df2), ]
if (nrow(new_jobs_df) > 0){
    # Drop feed columns we don't need, then rename and reorder the rest to
    # match the _Jobs schema.
    new_jobs_df <- subset(new_jobs_df, select = -c(X, author, summary,
                          content, extracted_content_url, published,
                          created_at))
    colnames(new_jobs_df) <- c("Job_id", "Site_id", "Job_title", "Job_url")
    # The feeds don't include company or location, so fill those with NA.
    new_jobs_df$Job_company <- NA
    new_jobs_df$Job_location <- NA
    rownames(new_jobs_df) <- NULL
    new_jobs_df <- new_jobs_df[c("Job_id", "Job_title", "Job_url",
                                 "Job_company", "Job_location", "Site_id")]
    # Job_complete == 0 marks listings whose descriptions still need fetching.
    new_jobs_df %<>%
        mutate(Job_complete = 0)
}
if (nrow(new_jobs_df2) > 0){
    # Drop LinkedIn columns we don't need, then rename and reorder the rest
    # to match the _Jobs schema.
    new_jobs_df2 <- subset(new_jobs_df2, select = -c(X, job_url, company_url,
                           linkedin_company_url_cleaned, posted_date,
                           normalized_company_name))
    colnames(new_jobs_df2) <- c("Job_url", "Job_company", "Job_title",
                                "Job_location")
    # Ids are assigned after merging; Site_id 1001 marks LinkedIn as source.
    new_jobs_df2$Job_id <- NA
    new_jobs_df2$Site_id <- 1001
    rownames(new_jobs_df2) <- NULL
    new_jobs_df2 <- new_jobs_df2[c("Job_id", "Job_title", "Job_url",
                                   "Job_company", "Job_location", "Site_id")]
    new_jobs_df2 %<>%
        mutate(Job_complete = 0)
}
# Append the new rows, keep only the first occurrence of each Job_url, and
# renumber Job_id sequentially (see the toy illustration below).
if (nrow(new_jobs_df) > 0){
    jobs_df <- rbindlist(list(jobs_df, new_jobs_df))[!duplicated(Job_url)]
}
if (nrow(new_jobs_df2) > 0){
    jobs_df <- rbindlist(list(jobs_df, new_jobs_df2))[!duplicated(Job_url)]
}
jobs_df %<>%
    mutate(Job_id = row_number())
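As referenced above, here is a toy illustration of the deduplication idiom (the rows are invented, not project data): rbindlist() returns a data.table, whose bracket syntax evaluates Job_url as a column, so only the first row per URL survives.
dt <- data.table::rbindlist(list(
    data.frame(Job_url = c("a", "b")),
    data.frame(Job_url = c("b", "c"))))
dt[!duplicated(Job_url)] # keeps "a", "b", "c"; the repeated "b" is dropped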
for (i in 1:nrow(jobs_df)){
    # NB: user_agent() builds a config object that read_html() below never
    # uses; see the sketch after this loop for a way to actually send it.
    httr::user_agent("Glen Davis")
    if (jobs_df[i, 7] == 0){
        dat <- try(xml2::read_html(jobs_df$Job_url[[i]]), silent = TRUE)
        if (inherits(dat, "try-error", which = FALSE)){
            jobs_df[i, 7] <- -1 # flag failed fetches for manual handling
            next
        }
    }else{
        next
    }
    if (jobs_df[i, 6] == 2594160){ #ai-jobs.net is source
        desc <- xml2::xml_find_all(
            dat, "//div[contains(@id, 'job-description')]")
    }else if (jobs_df[i, 6] == 977141){ #python.org is source
        desc <- xml2::xml_find_all(
            dat, "//div[contains(@class, 'job-description')]")
    }else if (jobs_df[i, 6] == 2594162){ #careercast it & eng is source
        desc <- xml2::xml_find_all(
            dat, "//div[contains(@class, 'arDetailDescriptionRow')]")
    }else if (jobs_df[i, 6] == 1378327){ #jobs for r-users is source
        desc <- xml2::xml_find_all(
            dat, "//div[contains(@class, 'section_content')]")
    }else if (jobs_df[i, 6] == 2593879){ #Indeed is source
        jobs_df[i, 7] <- -1 # not scraped; handled separately below
        next
    }else if (jobs_df[i, 6] == 2594166){ #Open Data Science is source
        desc <- xml2::xml_find_all(
            dat, "//div[contains(@class, 'job-desc')]")
    }else if (jobs_df[i, 6] == 2594174){ #MLconf is source
        desc <- xml2::xml_find_all(
            dat, "//div[contains(@class, 'job_description')]")
    }else if (jobs_df[i, 6] == 1001){ #Linkedin is source
        jobs_df[i, 7] <- -1 # not scraped; handled manually below
        next
    }else{
        jobs_df[i, 7] <- -1 # unknown source: flag for manual handling
        next
    }
    # Save the extracted description to <Job_id>.txt and mark it complete.
    desc <- xml2::xml_text(desc)
    fn <- paste(jobs_df[i, 1], ".txt", sep = "")
    writeLines(desc, con = fn)
    jobs_df[i, 7] <- 1
}
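Since httr::user_agent() on its own returns a request configuration without attaching it to any request, no User-Agent header is actually sent above. If identifying the scraper to the job sites mattered, one way would be to fetch each page with httr and parse the response text. This is a minimal sketch under that assumption; fetch_with_ua is a hypothetical helper, not part of the original pipeline:
# Hypothetical helper that actually sends a User-Agent header (sketch only)
fetch_with_ua <- function(url, ua = "Glen Davis"){
    resp <- httr::GET(url, httr::user_agent(ua))
    xml2::read_html(httr::content(resp, as = "text", encoding = "UTF-8"))
}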
# Export the LinkedIn listings we could not scrape so their descriptions can
# be looked up manually.
manual <- jobs_df %>%
    filter(Job_complete == -1 & Site_id == 1001)
write.csv(manual, "manual.csv", row.names = FALSE)
We manually look up the job descriptions for the listings in manual.csv and save them as an eighth column in a manual_edited.csv file. If we find a job description, we change the Job_complete value to 1; if we don't, we simply delete that row. If a previous manual_edited.csv exists, we save over it. We then upload the saved file to GitHub before continuing. After reading the file back in, a quick structural check (sketched below) helps catch a malformed upload before merging.
file <- "https://raw.githubusercontent.com/geedoubledee/data607_project3/main/manual_edited.csv"
manual_edited <- read.csv(file = file, header = TRUE)
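This is a minimal sanity check, not part of the original workflow; the expected layout (eight columns, Job_id in column one, Job_complete equal to 1 in column seven, the description in column eight) is read off the merge loop below:
# Hypothetical sanity check on manual_edited.csv (assumed layout)
stopifnot(ncol(manual_edited) == 8,
          !any(is.na(manual_edited[, 1])),
          all(manual_edited[, 7] == 1))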
for (i in 1:nrow(manual_edited)){
    # Job_id equals its row number in jobs_df after the renumbering above.
    job_id <- manual_edited[i, 1]
    if (jobs_df[job_id, 7] != 1){
        job_desc <- manual_edited[i, 8]
        jobs_df[job_id, 7] <- manual_edited[i, 7]
        fn <- paste(job_id, ".txt", sep = "")
        writeLines(job_desc, con = fn)
    }
}
completed_files <- readLines("completed_files.txt")
if (!("data_science_jobs_indeed_usa.csv" %in% completed_files)){
    # Indeed descriptions come from a Kaggle dataset rather than scraping.
    file <- "https://raw.githubusercontent.com/geedoubledee/data607_project3/main/data_science_jobs_indeed_usa.csv"
    kaggle_indeed <- read.csv(file = file, header = TRUE)
    kaggle_indeed <- subset(kaggle_indeed, select = -c(1, 5, 6, 7, 8))
    colnames(kaggle_indeed) <- c("Job_title", "Job_company", "Job_location",
                                 "Job_url", "Job_description")
    # Assign fresh sequential ids, mark Indeed (2593879) as the source, and
    # flag the rows complete once their descriptions are written out.
    kaggle_indeed$Job_id <- seq(nrow(jobs_df) + 1,
                                nrow(jobs_df) + nrow(kaggle_indeed))
    kaggle_indeed$Site_id <- 2593879
    kaggle_indeed$Job_complete <- 0
    rownames(kaggle_indeed) <- NULL
    kaggle_indeed <- kaggle_indeed[c("Job_id", "Job_title", "Job_url",
                                     "Job_company", "Job_location",
                                     "Site_id", "Job_complete",
                                     "Job_description")]
    jobs_df <- rbind(jobs_df, subset(kaggle_indeed, select = 1:7))
    for (i in 1:nrow(kaggle_indeed)){
        job_id <- kaggle_indeed[i, 1]
        job_desc <- kaggle_indeed[i, 8]
        jobs_df[job_id, 7] <- 1
        fn <- paste(job_id, ".txt", sep = "")
        writeLines(job_desc, con = fn)
    }
    write("data_science_jobs_indeed_usa.csv", file = "completed_files.txt",
          append = TRUE)
}
# Reconnect and write the updated jobs table back to the database only if it
# differs from the stored copy.
con <- DBI::dbConnect(
    RMariaDB::MariaDB(),
    dbname = "dat_sci_jobs",
    username = "root",
    password = as.character(read.table("sql_db.txt", header = FALSE)),
    host = "35.227.102.234")
tables <- dbListTables(con)
copy <- dbReadTable(con, "_Jobs")
if (!identical(jobs_df, copy)){
    dbWriteTable(con, "_Jobs", jobs_df, overwrite = TRUE)
}
dbDisconnect(con)
# Move the newly written <Job_id>.txt files into the jobs-txt/ folder.
files <- list.files(pattern = "^[1-9]+.*txt$")
if (length(files) > 0){
    file.copy(from = paste0(getwd(), "/", files),
              to = paste0(getwd(), "/jobs-txt/", files))
    file.remove(paste0(getwd(), "/", files))
}
files <- list.files(path = paste0(getwd(), "/jobs-txt/"),
                    pattern = "^[1-9]+.*txt$")
cols <- c("Text", "Job_id", "Line")
completed_txt_files <- readLines("completed_txt_files.txt")
if (length(completed_txt_files) == 0){
    text_df <- as.data.frame(matrix(nrow = 0, ncol = 3))
    colnames(text_df) <- cols
}else{
    my_url <- "https://raw.githubusercontent.com/geedoubledee/data607_project3/main/text_df.csv"
    text_df <- read.csv(file = my_url, header = TRUE)
}
url_base <- "https://raw.githubusercontent.com/geedoubledee/data607_project3/main/jobs-txt/"
for (i in seq_along(files)){
    if (!(files[i] %in% completed_txt_files)){
        file <- paste(url_base, files[i], sep = "")
        job_id <- str_replace(files[i], fixed(".txt"), "")
        lines <- readLines(file)
        # Pair each line of the description with its Job_id and line number.
        col2 <- matrix(job_id, nrow = length(lines), ncol = 1)
        col3 <- matrix(seq_len(length(lines)), nrow = length(lines),
                       ncol = 1)
        addition <- cbind(lines, col2, col3)
        colnames(addition) <- cols
        text_df <- rbind(text_df, addition)
        write(files[i], file = "completed_txt_files.txt", append = TRUE)
    }
}
rownames(text_df) <- NULL
write.csv(text_df, "text_df.csv", row.names = FALSE)
# Tokenize the descriptions into single words, drop stopwords, and compute
# each word's total frequency (term_freq) and the number of distinct job
# listings containing it (doc_count).
tidy_text_df_words <- text_df %>%
    unnest_tokens(word, Text)
tidy_text_words_analysis <- tidy_text_df_words %>%
    anti_join(get_stopwords()) %>%
    group_by(word) %>%
    summarize(term_freq = n(), doc_count = n_distinct(Job_id)) %>%
    arrange(desc(doc_count))
## Joining with `by = join_by(word)`
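To make the two metrics concrete, here is a toy illustration on invented data: term_freq counts every occurrence of a word across all descriptions, while doc_count counts how many distinct job listings contain it at least once.
toy <- tibble(Job_id = c(1, 1, 2),
              Text = c("python and python", "sql", "python"))
toy %>%
    unnest_tokens(word, Text) %>%
    group_by(word) %>%
    summarize(term_freq = n(), doc_count = n_distinct(Job_id))
# "python": term_freq 3 (three occurrences) but doc_count 2 (two listings)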
datatable(tidy_text_words_analysis, options = list(pageLength = 25))
## Warning in instance$preRenderHook(instance): It seems your data is too big for
## client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html
tidy_text_words_analysis %>%
with(wordcloud(word, doc_count, max.words = 50))
## Warning in wordcloud(word, doc_count, max.words = 50): the following terms
## could not be fit on page and will not be plotted: experience, working,
## including, business, technology, analytics, ability, data, engineering,
## environment, years, building, support, development, related, using, work
# Repeat the analysis with two-word phrases (bigrams).
tidy_text_df_bigrams <- text_df %>%
    unnest_tokens(bigram, Text, token = "ngrams", n = 2)
tidy_text_bigrams_analysis <- tidy_text_df_bigrams %>%
group_by(bigram) %>%
summarize(term_freq = n(), doc_count = n_distinct(Job_id)) %>%
arrange(desc(doc_count))
datatable(tidy_text_bigrams_analysis, options = list(pageLength = 25))
## Warning in instance$preRenderHook(instance): It seems your data is too big for
## client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html
tidy_text_bigrams_analysis %>%
with(wordcloud(bigram, doc_count, max.words = 50))
## Warning in wordcloud(bigram, doc_count, max.words = 50): the following
## bigrams could not be fit on page and will not be plotted: years of,
## ability to, looking for, equal opportunity, committed to, we are
# Repeat the analysis with three-word phrases (trigrams).
tidy_text_df_trigrams <- text_df %>%
    unnest_tokens(trigram, Text, token = "ngrams", n = 3)
tidy_text_trigrams_analysis <- tidy_text_df_trigrams %>%
group_by(trigram) %>%
summarize(term_freq = n(), doc_count = n_distinct(Job_id)) %>%
arrange(desc(doc_count))
datatable(tidy_text_trigrams_analysis, options = list(pageLength = 25))
## Warning in instance$preRenderHook(instance): It seems your data is too big for
## client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html